sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS 12.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1-arm64/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1-arm64/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_4.1.0  magrittr_2.0.1  fastmap_1.1.0   tools_4.1.0    
##  [5] htmltools_0.5.2 yaml_2.2.1      jquerylib_0.1.4 stringi_1.7.6  
##  [9] rmarkdown_2.11  knitr_1.36      stringr_1.4.0   xfun_0.28      
## [13] digest_0.6.29   rlang_0.4.12    evaluate_0.14
library(tidyverse)
library(lubridate)
library(factoextra)
library(gridExtra)
library(mltools)
library(tmaptools)

set.seed(729)
my.data <- read_csv("Airplane_Crashes_and_Fatalities_Since_1908.csv") %>% 
  tibble()

Data Wrangle

head(my.data)
## # A tibble: 6 × 13
##   Date       Time  Location Operator `Flight #` Route Type  Registration `cn/In`
##   <chr>      <chr> <chr>    <chr>    <chr>      <chr> <chr> <chr>        <chr>  
## 1 09/17/1908 17:18 Fort My… Militar… <NA>       Demo… Wrig… <NA>         1      
## 2 07/12/1912 06:30 Atlanti… Militar… <NA>       Test… Diri… <NA>         <NA>   
## 3 08/06/1913 <NA>  Victori… Private  -          <NA>  Curt… <NA>         <NA>   
## 4 09/09/1913 18:30 Over th… Militar… <NA>       <NA>  Zepp… <NA>         <NA>   
## 5 10/17/1913 10:30 Near Jo… Militar… <NA>       <NA>  Zepp… <NA>         <NA>   
## 6 03/05/1915 01:00 Tienen,… Militar… <NA>       <NA>  Zepp… <NA>         <NA>   
## # … with 4 more variables: Aboard <dbl>, Fatalities <dbl>, Ground <dbl>,
## #   Summary <chr>
tail(my.data)
## # A tibble: 6 × 13
##   Date       Time  Location Operator `Flight #` Route Type  Registration `cn/In`
##   <chr>      <chr> <chr>    <chr>    <chr>      <chr> <chr> <chr>        <chr>  
## 1 05/03/2009 12:00 Near El… Militar… <NA>       Patr… Mi-35 EV08114      <NA>   
## 2 05/20/2009 06:30 Near Ma… Militar… <NA>       Jaka… Lock… A-1325       1982   
## 3 05/26/2009 <NA>  Near Is… Service… <NA>       Goma… Anto… 9Q-CSA       5005   
## 4 06/01/2009 00:15 Atlanti… Air Fra… 447        Rio … Airb… F-GZCP       660    
## 5 06/07/2009 08:30 Near Po… Strait … <NA>       Lour… Brit… C-FJJR       424    
## 6 06/08/2009 <NA>  State o… Militar… <NA>       Mech… Anto… <NA>         <NA>   
## # … with 4 more variables: Aboard <dbl>, Fatalities <dbl>, Ground <dbl>,
## #   Summary <chr>

First, we need to deal with the missing values.

my.data$Time[is.na(my.data$Time)] <- '00:00'
my.data$Time <- str_replace_all(my.data$Time, 'c: ', '')
my.data$Time <- str_replace_all(my.data$Time, 'c:', '')
my.data$Time <- str_replace_all(my.data$Time, 'c', '')
my.data$Time <- str_replace_all(my.data$Time, '\'', ':')
my.data$Time <- str_replace_all(my.data$Time, '\\.', ':')
my.data$Time[!is.na(str_match(my.data$Time, '[0-9]{4}'))]
## [1] "0943"

Modify the time 0943.

my.data$Time <- str_replace_all(my.data$Time, '0943', '09:43')
my.data$Time[!is.na(str_match(my.data$Time, '[0-9]{3}'))]
## [1] "114:20"
my.data$Time <- str_replace_all(my.data$Time, '114:20', '00:00')

Modify 114:20.

my.data$Time <- paste(my.data$Date, my.data$Time)
my.data$Time <- mdy_hm(my.data$Time)
my.data <- my.data %>%
  select(-c(`Flight #`, Route, Registration, `cn/In`, Summary)) %>%
  na.omit()

Visualizing

my.data %>%
  group_by(year(Time)) %>%
  summarise(count = n()) %>%
  ggplot(aes(`year(Time)`, count)) +
  geom_line(color = "gray52") +
  geom_point(color = "sienna1") +
  theme_light() +
  xlab("Year") +
  ylab("Counts")

The number of air crashes was higher from 1940 to 2005. Next, let’s see if the crash is related to a specific number of months, days of the week, or a specific time period.

p.week <- my.data %>%
  group_by(wday(Time, week_start = 1, label = TRUE)) %>%
  summarise(count = n()) %>%
  ggplot(aes(`wday(Time, week_start = 1, label = TRUE)`, count)) +
  geom_col(fill = "#90AFC5", width = 0.8) +
  theme_light() +
  xlab("Day of Week") +
  ylab("Counts")
p.month <- my.data %>%
  group_by(month(Time, label = TRUE)) %>%
  summarise(count = n()) %>%
  ggplot(aes(`month(Time, label = TRUE)`, count)) +
  geom_col(fill = "#336B87", width = 0.8) +
  theme_light() +
  xlab("Month") +
  ylab("Counts")
p.day <- my.data %>%
  filter(hour(Time) != 0 | minute(Time) != 0) %>%
  group_by(hour(Time)) %>%
  summarise(count = n()) %>%
  ggplot(aes(`hour(Time)`, count)) +
  geom_col(fill = "#2A3132", width = 0.8) +
  theme_light() +
  xlab("Hour") +
  ylab("Counts") +
  scale_x_continuous(breaks = seq(0, 24, 4))

A significant increase in the number of air crashes can be seen during certain time periods. But we don’t know the exact reason for this increase.

It is possible that some factors during these specific periods of time have led to a higher probability of air crashes. It could also be that the number of flights increased during the specific time period, which led to an increase in the number of crashes.

# library(Rmisc)
# multiplot(p.week, p.month, p.day, layout = matrix(c(1, 2, 3, 3),
#                                                   nrow = 2, byrow = TRUE))
my.data <- my.data %>%
  mutate(Survivors = Aboard - Fatalities,
         SurvivalRate = Survivors / Aboard) %>% 
  na.omit()
summary(my.data)
##      Date                Time                       Location        
##  Length:5179        Min.   :1908-09-17 17:18:00   Length:5179       
##  Class :character   1st Qu.:1954-12-20 18:30:00   Class :character  
##  Mode  :character   Median :1973-06-03 00:00:00   Mode  :character  
##                     Mean   :1972-01-31 13:13:04                     
##                     3rd Qu.:1990-08-28 00:47:00                     
##                     Max.   :2009-06-08 00:00:00                     
##    Operator             Type               Aboard         Fatalities    
##  Length:5179        Length:5179        Min.   :  1.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  5.00   1st Qu.:  3.00  
##  Mode  :character   Mode  :character   Median : 13.00   Median :  9.00  
##                                        Mean   : 27.76   Mean   : 20.21  
##                                        3rd Qu.: 30.00   3rd Qu.: 23.00  
##                                        Max.   :644.00   Max.   :583.00  
##      Ground         Survivors        SurvivalRate  
##  Min.   :   0.0   Min.   :  0.000   Min.   :0.000  
##  1st Qu.:   0.0   1st Qu.:  0.000   1st Qu.:0.000  
##  Median :   0.0   Median :  0.000   Median :0.000  
##  Mean   :   1.6   Mean   :  7.553   Mean   :0.166  
##  3rd Qu.:   0.0   3rd Qu.:  2.000   3rd Qu.:0.200  
##  Max.   :2750.0   Max.   :516.000   Max.   :1.000
top_n(count(my.data, Location, sort = TRUE), 10)
## Selecting by n
## # A tibble: 11 × 2
##    Location                   n
##    <chr>                  <int>
##  1 Sao Paulo, Brazil         15
##  2 Moscow, Russia            14
##  3 Anchorage, Alaska         13
##  4 Bogota, Colombia          13
##  5 Manila, Philippines       13
##  6 Cairo, Egypt              12
##  7 Chicago, Illinois         11
##  8 New York, New York        11
##  9 Rio de Janeiro, Brazil    11
## 10 AtlantiOcean               9
## 11 Tehran, Iran               9
top_n(count(my.data, Operator, sort = TRUE), 10)
## Selecting by n
## # A tibble: 11 × 2
##    Operator                            n
##    <chr>                           <int>
##  1 Aeroflot                          176
##  2 Military - U.S. Air Force         174
##  3 Air France                         67
##  4 Deutsche Lufthansa                 62
##  5 Air Taxi                           44
##  6 Military - U.S. Army Air Forces    43
##  7 United Air Lines                   43
##  8 Pan American World Airways         40
##  9 American Airlines                  36
## 10 Military - Royal Air Force         36
## 11 Military - U.S. Navy               36
top_n(count(my.data, Type, sort = TRUE), 10)
## Selecting by n
## # A tibble: 11 × 2
##    Type                                         n
##    <chr>                                    <int>
##  1 Douglas DC-3                               331
##  2 de Havilland Canada DHC-6 Twin Otter 300    81
##  3 Douglas C-47A                               73
##  4 Douglas C-47                                60
##  5 Douglas DC-4                                40
##  6 Yakovlev YAK-40                             37
##  7 Antonov AN-26                               36
##  8 Junkers JU-52/3m                            31
##  9 Douglas C-47B                               29
## 10 De Havilland DH-4                           27
## 11 Douglas DC-6B                               27
temp.data <- my.data %>%
  group_by(year(Time)) %>%
  summarise(Aboard = sum(Aboard),
            Fatalities = sum(Fatalities)) %>%
  ungroup() %>%
  mutate(Sur = Aboard - Fatalities,
         SurRate = Sur / Aboard) %>%
  na.omit()

temp.data %>%
  ggplot(aes(`year(Time)`, SurRate)) +
  geom_col(fill = 'gray60') +
  theme_light() +
  xlab('Years') +
  ylab('Survival Rate') +
  geom_hline(yintercept = 0.166) +
  geom_text(
    aes(x = 1908, y = 0.166),
    label = 'Average',
    vjust = 1.5,
    hjust = 0.5,
    color = 'indianred1'
  )

cluster.data <- my.data %>%
  select(c(6:7, 9)) %>%
  scale()
k2 <- kmeans(cluster.data, centers = 2, nstart = 25)
k3 <- kmeans(cluster.data, centers = 3, nstart = 25)
k4 <- kmeans(cluster.data, centers = 4, nstart = 25)
k5 <- kmeans(cluster.data, centers = 5, nstart = 25)
p2 <-
  fviz_cluster(k2, geom = "point", data = cluster.data) + ggtitle("K-means k=2") +
  theme_light()
p3 <-
  fviz_cluster(k3, geom = "point", data = cluster.data) + ggtitle("K-means k=3") +
  theme_light()
p4 <-
  fviz_cluster(k4, geom = "point", data = cluster.data) + ggtitle("K-means k=4") +
  theme_light()
p5 <-
  fviz_cluster(k5, geom = "point", data = cluster.data) + ggtitle("K-means k=5") +
  theme_light()

grid.arrange(p2, p3, p4, p5)

fviz_nbclust(cluster.data, kmeans, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2) +
  theme_light() +
  theme(title = element_blank())

clustered.data <- tibble(my.data, cluster = k3$cluster)
(summary_cluster <- clustered.data %>%
  group_by(cluster) %>%
  summarise(
    Counts = n(),
    Mean_Aboard = mean(Aboard),
    Mean_Fatalities = mean(Fatalities),
    Mean_Sur_Rate = mean(SurvivalRate)
  ))
## # A tibble: 3 × 5
##   cluster Counts Mean_Aboard Mean_Fatalities Mean_Sur_Rate
##     <int>  <int>       <dbl>           <dbl>         <dbl>
## 1       1    111       178.             12.7        0.930 
## 2       2   4723        17.2            13.3        0.156 
## 3       3    345       125.            118.         0.0532
or <- summary_cluster$Mean_Sur_Rate %>% 
  rank()
clustered.data$cluster_id <-
  ifelse(
    clustered.data$cluster == or[3],
    "Large Passenger High Survival",
    ifelse(
      clustered.data$cluster == or[1],
      "Large Passenger High Fatality",
      "Small-Midsize Crashes"
    )
  )
LgSurvive <-
  clustered.data[clustered.data$cluster_id == "Large Passenger High Survival", ]
LgFatal <-
  clustered.data[clustered.data$cluster_id == "Large Passenger High Fatality", ]

Top10TypeSurvive <-
  top_n(count(LgSurvive, Type, sort = TRUE), 10)
## Selecting by n
Top10TypeFatal <-
  top_n(count(LgFatal, Type, sort = TRUE), 10)
## Selecting by n
Top10OperSurvive <-
  top_n(count(LgSurvive, Operator, sort = TRUE), 10)
## Selecting by n
Top10OperFatal <- top_n(count(LgFatal, Operator, sort = TRUE), 10)
## Selecting by n
Top10TypeSurvive %>%
  ggplot(aes(x = reorder(Type, n), y = n)) +
  geom_col(fill = "mediumseagreen") +
  coord_flip() +
  # ggtitle("Top 10 Large Passenger High Survival Crashes by Model") +
  xlab("Plane Model") +
  ylab("Number of Crashes") +
  theme_light()

Top10TypeFatal %>%
  ggplot(aes(x = reorder(Type, n), y = n)) +
  geom_col(fill = "orangered3") +
  coord_flip() +
  # ggtitle("Top 10 Large Passenger High Fatality Crashes by Model") +
  xlab("Plane Model") +
  ylab("Number of Crashes") +
  theme_light()

ggplot(Top10OperSurvive, aes(x = reorder(Operator, n), y = n)) +
  geom_col(fill = "mediumseagreen") +
  coord_flip() +
  # ggtitle("Top Ten Large Passenger High Survival Crashes by Airline") +
  xlab("Operator") +
  ylab("Number of Crashes") +
  theme_light()

ggplot(Top10OperFatal, aes(x = reorder(Operator, n), y = n)) +
  geom_col(fill = "orangered3") +
  coord_flip() +
  # ggtitle("Top Ten Large Passenger High Fatality Crashes by Airline") +
  xlab("Operator") +
  ylab("Number of Crashes") +
  theme_light()

Locations <-
  as.character(clustered.data$Location[clustered.data$cluster != 2])
Locations <- gsub("Near ", "", Locations)
Locations <- gsub("Off ", "", Locations)
Locations <- gsub("off ", "", Locations)
Locations <- gsub("Czechoslovakia", "Czechia", Locations)
Locations <- gsub("AFB", "", Locations)
Locations <- gsub("Llandow Airport, ", "", Locations)
Locations <- gsub("La Guardia ", "LaGuardia ", Locations)
Locations <- gsub("Staten Island / ", "", Locations)
Locations <- gsub("Morrocco", "Morocco", Locations)
Locations <- gsub("Guadaloupe", "Guadeloupe", Locations)
Locations <- gsub("USSR", "Russia", Locations)
Locations <- gsub("Rio de Janerio", "Rio de Janeiro", Locations)
Locations <- gsub("Papanga", "Pampanga", Locations)
Locations <- gsub("Island of ", "", Locations)
Locations <- gsub(", Yugoslavia", "", Locations)
Locations <- gsub("near Roussillon, ", "", Locations)
Locations <- gsub("Covington/Hebron, ", "", Locations)
Locations <- gsub(" (Namibia)", "", Locations)
Locations <- gsub("Morioko", "Morioka", Locations)
Locations <- gsub("Mt. Fuji, ", "", Locations)
Locations <- gsub("Yuhnov", "Yukhnov", Locations)
Locations <- gsub("Ukraine, Russia", "Ukraine", Locations)
Locations <- gsub("Staines, Surrey, ", "", Locations)
Locations <- gsub("Massachusett", "Massachusetts", Locations)
Locations <- gsub("Corunda", "Coruna", Locations)
Locations <- gsub("Thirty-five miles west of ", "", Locations)
Locations <- gsub("Mountains ", "", Locations)
Locations <- gsub(" near Kampung Ladang, ", "", Locations)
Locations <- gsub("Elburz Mtns., near ", "", Locations)
Locations <- gsub("New York, New York", "New York", Locations)
Locations <- gsub("300 nm NW of ", "", Locations)
Locations <- gsub("950 nm S of  ", "", Locations)
Locations <- gsub("Western PacifiOcean, ", "", Locations)
Locations <- gsub("PacifiOcean", "Pacific Ocean", Locations)
Locations <- gsub("AtlantiOcean", "Atlantic Ocean", Locations)
Locations <- gsub("Over the ", "", Locations)
Locations <- gsub("110 miles SW of ", "", Locations)
Locations <-
  gsub("DemocratiRepubliCogo",
       "Democratic Republic of Congo",
       Locations)
Locations <- gsub("Calilfornia", "California", Locations)
Locations <-
  gsub(", 570 miles northeast of Natal, Brazil", "", Locations)
Locations <-
  gsub(", 100 miles W of Galway Bay, Ireland", "", Locations)
Locations <- gsub(", 110 miles West of Ireland", "", Locations)
Locations <-
  gsub(", 116 miles WSW of Annette Island, Alaska", "", Locations)
Locations <- gsub("Mariana Islands", "", Locations)
Locations <- gsub("Atalayasa", "Atalaya", Locations)
Locations <- gsub("Wusterausen", "Wusterhausen", Locations)
Locations <- gsub("Vrastsa", "Vratsa", Locations)
Locations <- gsub("Dneprodzerzhinsk", "Kamianske", Locations)
Locations <- gsub("At ", "", Locations)
Locations <- gsub("near Ajaccio, ", "", Locations)
Locations <- gsub("En route Miami, FL - ", "", Locations)
Locations <- gsub("near Ueno Village, ", "", Locations)
Locations <- gsub("French Alps, ", "", Locations)
Locations <-
  gsub("Kimpo Air Base", "Gimpo International Airport", Locations)
Locations <- gsub("Carpich", "Carpish", Locations)
Locations <- gsub(" (Namibia)", "", Locations)
Locations <-
  gsub("Al Fujayrah", "Fujairah International Airport", Locations)
Locations <- gsub(", Kefallinia, Greece", "", Locations)
Locations <- gsub("Karatepe Mountains", "Karatepe", Locations)
Locations <- gsub("Jeddah, ", "", Locations)
Locations <- gsub("Laskarak, ", "", Locations)
Locations <- gsub(", Ustica, Italy, ", "", Locations)
Locations <- gsub("Southern Belarus", "Belarus", Locations)
Locations <- gsub("Syktyvar", "Syktyvkar", Locations)
Locations <- gsub("Ko Phuket", "Phuket", Locations)
Locations <- gsub("Krosnovodsk", "Krasnovodsk", Locations)
Locations <- gsub("Combi, ", "", Locations)
Locations <- gsub("Persian Gulf, near ", "", Locations)
Locations <- gsub("Armenia, ", "", Locations)
Locations <- gsub("Mt. Saint-Odile, near ", "", Locations)
Locations <- gsub("Mt. Lalaboy,  ", "", Locations)
Locations <- gsub("Khorog", "", Locations)
Locations <- gsub("Tidjika", "Tidjikja", Locations)
Locations <- gsub("Kahengula, ", "", Locations)
Locations <- gsub("Domincan", "Dominican", Locations)
Locations <- gsub("Charkhidadri", "Charkhi Dadri", Locations)
Locations <- gsub("Massachusettss", "Massachusetts", Locations)
Locations <- gsub("Amritsar, India / ", "", Locations)
Locations <- gsub("Republiof", "Republic of", Locations)
Locations <- gsub("Khorramabed", "Khorramabad", Locations)

# coords <- geocode_OSM(Locations, keep.unfound = TRUE)
# 
# saveRDS(coords, file = "coords.RData")

coords <- readRDS(file = "coords.RData")
map.data <- clustered.data[clustered.data$cluster != 2, ]

map.data$Latitude <- coords$lat
map.data$Longitude <- coords$lon

map.data <- na.omit(map.data)
map_data("world") %>%
  ggplot() +
  coord_fixed() +
  geom_polygon(aes(x = long, y = lat, group = group),
               fill = "white",
               colour = "grey50") +
  geom_point(
    data = map.data,
    aes(
      x = Longitude,
      y = Latitude,
      color = cluster_id,
      size = 1 - SurvivalRate
    ),
    alpha = 0.2
  ) +
  theme_light() +
  # ggtitle("Large Passenger Plane Crash Locations 1933 - 2009") +
  scale_color_manual("Cluster", values = alpha(c("orangered3", "green"))) +
  theme(legend.position = "none") +
  scale_size(range = c(0, 2.5))